Let us first start by installing and loading the packages we will use during this whole assignment.
# Install a package from the RedIRIS CRAN mirror.  install.packages()
# returns NULL invisibly, so this always yields TRUE — which lets the
# `require(pkg) || i.p("pkg") && require(pkg)` bootstrap chain proceed.
i.p <- function(pkg) {
  install.packages(pkg, repos = "https://cran.rediris.es")
  TRUE
}
# Bootstrap idiom: `require(pkg) || i.p("pkg") && require(pkg)` loads each
# package, installing it first only if it is missing.  Since && binds
# tighter than ||, this parses as require(pkg) || (i.p(...) && require(pkg)),
# which is the intended behaviour.
# gsub on files
require(xfun) || i.p("xfun") && require(xfun)
# Tibbles and plots
require(tidyverse) || i.p("tidyverse") && require(tidyverse)
require(patchwork) || i.p("patchwork") && require(patchwork)
# Tsibbles and models for them
require(tsibble) || i.p("tsibble") && require(tsibble)
require(feasts) || i.p("feasts") && require(feasts)
require(fable) || i.p("fable") && require(fable)
require(fable.prophet) || i.p("fable.prophet") && require(fable.prophet)
require(fabletools) || i.p("fabletools") && require(fabletools)
# Machine Learning
require(modeltime) || i.p("modeltime") && require(modeltime)
require(timetk) || i.p("timetk") && require(timetk)
require(rsample) || i.p("rsample") && require(rsample)
require(parsnip) || i.p("parsnip") && require(parsnip)
# Plot theme
theme_set(theme_minimal())
# English for dates
# NOTE(review): with no category argument this sets LC_ALL; assumes the
# en_GB.UTF-8 locale exists on the host — confirm on non-UNIX systems.
Sys.setlocale(locale = "en_GB.UTF-8")
# Useful functions
# Return the most frequent value (mode) of `x` as a numeric scalar.
# Ties resolve to the value that tabulates first.
most_freq <- function(x) {
  counts <- table(x)
  as.numeric(names(counts)[which.max(counts)])
}
# Python-style zip: pair up the i-th elements of every input, yielding a
# list of lists.  Names supplied in ... are kept on each inner list.
zip <- function(...) {
  Map(list, ...)
}
# Like zip(), but prepend an `ix` counter that runs along the first input,
# so each inner list carries its 1-based position.
enumerate <- function(...) {
  first <- ..1
  zip(ix = seq_along(first), ...)
}
# Plot functions
# One fixed colour per series so every figure in the report uses a
# consistent palette (keys match the renamed data_ts columns).
colors <- c("CO" = "#CD3333",
            "C6H6" = "#EE7621",
            "NOy" = "#EEC900",
            "NO2" = "#43CD80",
            "Temp" = "#009ACD",
            "TempM" = "#8B008B")
# Three-panel overview of a tsibble column `.var`: the series on top and,
# below it, its ACF next to either a histogram (type = "histogram") or its
# PACF (any other `type`).  `lag_max` is forwarded to ACF()/PACF().
pretty_ts <- function(.data, .var, title, type = "histogram", lag_max = NULL) {
  # 95% white-noise band for the correlograms: +/- 1.96 / sqrt(n)
  norm <- sqrt(nrow(.data))
  # Shared lollipop correlogram (the original duplicated this verbatim for
  # ACF and PACF).  `ycol` names the correlation column, `ylab` the axis.
  corr_plot <- function(corr_data, ycol, ylab) {
    corr_data %>%
      ggplot(aes(x = lag, y = .data[[ycol]])) +
      geom_hline(aes(yintercept = 0)) +
      geom_segment(mapping = aes(xend = lag, yend = 0),
                   color = colors[.var], alpha = 0.75) +
      geom_hline(aes(yintercept = 1.96/norm),
                 linetype = 2, color = colors[.var]) +
      geom_hline(aes(yintercept = -1.96/norm),
                 linetype = 2, color = colors[.var]) +
      labs(title = "", x = "", y = ylab)
  }
  p1 <- .data %>%
    autoplot(.data[[.var]], color = colors[.var]) +
    labs(title = title, x = "", y = "")
  p2 <- .data %>%
    ACF(.data[[.var]], lag_max = lag_max) %>%
    corr_plot("acf", "ACF")
  if (type == "histogram") {
    # Distribution panel: density histogram with a kernel-density overlay
    p3 <- .data %>%
      ggplot(aes(x = .data[[.var]])) +
      geom_histogram(aes(y = after_stat(density)),
                     color = "white", fill = colors[.var], alpha = 0.25) +
      geom_density(lwd = 1, color = colors[.var]) +
      labs(title = "", x = "", y = "Density")
  } else {
    p3 <- .data %>%
      PACF(.data[[.var]], lag_max = lag_max) %>%
      corr_plot("pacf", "PACF")
  }
  # patchwork layout: series on top, (ACF | histogram-or-PACF) below
  print(p1 / (p2 | p3))
}
# Three-panel residual diagnostic: the `.resid` series on top, its ACF and a
# density histogram below.  `.var` only selects the series colour from
# `colors`; the values plotted always come from the `.resid` column
# (i.e. the output of residuals() on a mable).
pretty_resid <- function(.data, .var, title, lag_max = NULL) {
  # 95% white-noise band for the ACF: +/- 1.96 / sqrt(n)
  norm <- sqrt(nrow(.data))
  p1 <- .data %>%
    autoplot(.data[[".resid"]], color = colors[.var]) +
    labs(title = title, x = "", y = "")
  p2 <- .data %>%
    ACF(.data[[".resid"]], lag_max = lag_max) %>%
    ggplot(aes(x = lag, y = acf)) +
    geom_hline(aes(yintercept = 0)) +
    geom_segment(mapping = aes(xend = lag, yend = 0),
                 color = colors[.var], alpha = 0.75) +
    geom_hline(aes(yintercept = 1.96/norm),
               linetype = 2, color = colors[.var]) +
    geom_hline(aes(yintercept = -1.96/norm),
               linetype = 2, color = colors[.var]) +
    labs(title = "", x = "", y = "ACF")
  p3 <- .data %>%
    ggplot(aes(x = .data[[".resid"]])) +
    geom_histogram(aes(y = after_stat(density)),
                   color = "white", fill = colors[.var], alpha = 0.25) +
    geom_density(lwd = 1, color = colors[.var]) +
    labs(title = "", x = "", y = "Density")
  # patchwork layout: residual series on top, ACF | density below
  print(p1 / (p2 | p3))
}
# Side-by-side comparison of a series and its Box-Cox transform, each
# overlaid with its STL trend, to judge visually whether the transform
# stabilises variance.  Lambda is chosen automatically by box_cox_vec.
compare_box_cox <- function(.data, .var, title) {
  .data$bc <- box_cox_vec(.data[[.var]], silent = TRUE)
  # STL trend of the raw and transformed series.
  # NOTE(review): as_tibble() on the unnamed trend ts produces a single
  # column called "x", which the autolayer() calls below rely on — confirm
  # this still holds if tibble/tsibble versions change.
  stl_nm <- stl(as.ts(.data)[, .var], 24)$time.series[, "trend"] %>% as_tibble()
  stl_bc <- stl(as.ts(.data)[, "bc"], 24)$time.series[, "trend"] %>% as_tibble()
  # Re-attach the original time index and convert back to tsibbles
  stl_nm$index <- .data$Date
  stl_bc$index <- .data$Date
  stl_nm <- stl_nm %>% as_tsibble(index = index)
  stl_bc <- stl_bc %>% as_tsibble(index = index)
  p1 <- .data %>%
    autoplot(.data[[.var]], color = colors[.var], alpha = 0.25) +
    autolayer(stl_nm, x, color = colors[.var]) +
    labs(title = title, subtitle = "No transformation applied", x = "", y = "")
  p2 <- .data %>%
    autoplot(.data[["bc"]], color = colors[.var], alpha = 0.25) +
    autolayer(stl_bc, x, color = colors[.var]) +
    labs(title = "", subtitle = "Box-Cox transformed", x = "", y = "")
  print(p1 | p2)
}
This data records the temperature, as well as the concentration in air of different polluting agents at road level in an Italian city. The data is taken from UCI Machine Learning Repository.
Our intent with this data, which is hourly recorded during one year, is to forecast the concentration of those same gases for the next six months and use it to predict the effect on the temperature these agents may have.
The code below will fetch the data and extract it for us. It will also apply basic transformations to help the code parse the input by transforming the fields to the CSV standard.
download_url <- paste0("https://archive.ics.uci.edu/ml/",
                       "machine-learning-databases/00360/AirQualityUCI.zip")
# Fetch the zipped dataset once and cache it next to the script.
if (!file.exists("AirQualityUCI.zip")) {
  # mode = "wb" is required so the binary zip is not corrupted on Windows
  download.file(download_url, "./AirQualityUCI.zip", mode = "wb")
}
if (!file.exists("AirQualityUCI.csv")) {
  unzip("AirQualityUCI.zip")
  # Normalise the file to standard CSV: decimal commas -> dots, drop the
  # trailing empty ";;" fields, then semicolon separators -> commas.
  # The order matters: commas must be replaced before ";" becomes ",".
  gsub_file("AirQualityUCI.csv", ",", ".")
  gsub_file("AirQualityUCI.csv", ";;", "")
  gsub_file("AirQualityUCI.csv", ";", ",")
}
data <- read_csv("AirQualityUCI.csv", show_col_types = FALSE)
The first thing we need is to create a variable which will act as our
temporal index for the time series. This will be comprised of the
Date and Time variables which we will join and
transform to match the “UTC+1” 1 2 format. We will also take the opportunity
and incorporate into the pipeline the replacement of -200
as NA, just as specified in the webpage where the dataset
was taken. These NA values will later be imputed, but not yet.
Finally, we will transform into a tsibble object for the
future analysis and visualization.
# Build the temporal index: join Date + Time into one POSIXct column
# (input format "dd/mm/yyyy HH.MM.SS") and recode the sentinel -200 as NA.
# NOTE(review): tz = "GMT" is used although the prose says the data is in
# UTC+1; the constant offset does not affect the analysis, but confirm if
# absolute timestamps ever matter.
data_ts <- data %>%
  add_column(Datetime = as.POSIXct(paste(data$Date, data$Time),
                                   format = "%d/%m/%Y %H.%M.%S",
                                   tz = "GMT"), .before = 1) %>%
  select(!c(Date, Time)) %>%
  # -200 marks missing measurements in the UCI source
  replace(. == -200, NA) %>%
  as_tsibble(index = Datetime)
Now we select what are the time series we are interested in. We refer to the information of the different time series present in the data and pick those which are taken from “reference analyser” along with the temperature. These will be the time series we will focus for the rest of the assignment.
# Keep the index, the temperature ("T") and the reference-analyser series
# (the columns suffixed "(GT)")
data_ts <- data_ts %>%
  select(c("Datetime", "T", ends_with("(GT)")))
Now we look at the distribution of the missing values. We note that
for the variable NMHC(GT) the huge majority of the data are
in reality NAs. Hence we decide to remove the aforementioned variable
and keep the rest which have a count of NAs which we may handle.
# Count NAs per column; NMHC(GT) is overwhelmingly missing, so drop it
data_ts %>% is.na() %>% colSums()
## Datetime T CO(GT) NMHC(GT) C6H6(GT) NOx(GT) NO2(GT)
## 0 366 1683 8443 366 1639 1642
data_ts <- data_ts %>% select(!"NMHC(GT)")
Lastly, for convenience, we rename the columns to get rid of the
(GT) which was used to indicate reference analyser
variables.
# Strip the "(GT)" suffix, then rename the index column (Datetime -> Date)
# and the temperature column (T -> Temp, avoiding the TRUE-shorthand clash)
colnames(data_ts) <- gsub("[(]GT[)]", "", colnames(data_ts))
colnames(data_ts)[1] <- "Date"
colnames(data_ts)[2] <- "Temp"
Let us visualize all the considered variables first. For the temperature we appreciate the oscillatory behaviour expected from an annual period. If we look at the ACF, we see a clear daily seasonality, which makes sense for temperature measurements.
# Temperature overview: series, ACF and distribution
data_ts %>% pretty_ts("Temp", title = "Temperature [°C]")
For the CO concentrations we have a different picture. First, we can note significantly the presence of missing values in the time series plot. Given that these come in patches, it is easy to suppose that they may come from sensor shutdown at concrete time periods. It is also worth noting a decrease of CO concentration in summer. This is probably due to the absence of heating devices given the warmer weather in the analysed region.
Looking at the ACF we observe a semi-daily period, which may come from the fact that heating devices, one of the main CO emitters, are switched with a usually semi-daily period, namely during the morning and late afternoon. Nonetheless, its seasonality appears to be weaker than in the case of the temperature.
# CO overview: note the patches of missing data and the summer dip
data_ts %>% pretty_ts("CO", title = "CO [mg/m³]")
Benzene (\(C_6H_6\)) emission channels are a subset of those for carbon monoxide. This is appreciated when looking at the plots, which exhibit a very similar behaviour for the time series. However, in the ACF we note that the seasonality is now only daily and with a much smaller trend component. Our intuition is that the reduced set of emission channels is the cause of this difference.
# Benzene overview
data_ts %>% pretty_ts("C6H6", title = "Benzene [μg/m³]")
Nitrogen oxides are another family of polluting agents. Here we have two columns which refer to them: \(NO_x\) and \(NO_2\). The first incorporates the effect of all nitrogen oxides. It is thus interesting to get rid of the \(NO_2\) contribution in the \(NO_x\) column in order to decouple both variables. We do so by using the formula conversion between ppb and μg/m³, \[\begin{equation*} C(\mathrm{ppb}) = C(\mu\mathrm{g/m}^3) \cdot \frac{V(\mathrm{air})}{M(NO_2)} \;, \end{equation*}\] where \(M(NO_2)\) refers to the molecular mass of nitrogen dioxide, 46.006 g/mol, and \(V(\mathrm{air})\) is calculated using Gay-Lussac’s law considering approximating the air pressure as a constant and taking the volume to be \(24.45\) litres at \(25\) degree Celsius.
# Strip the NO2 contribution from NOx to decouple the two series.  NO2 is
# converted from ug/m3 to ppb via C(ppb) = C(ug/m3) * V(air) / M(NO2), with
# M(NO2) = 46.006 g/mol and a molar volume of 24.45 l at 25 °C (298.15 K),
# rescaled to the measured temperature via Gay-Lussac's law.
# Fix: the Kelvin constants were mistyped as 273.25 / 298.25.
data_ts$NOy <- data_ts$NOx - 24.45/46.006*data_ts$NO2 * (273.15 + data_ts$Temp)/298.15
# Delete pathological cases (a negative concentration is unphysical and can
# only arise from measurement error in NOx or NO2)
data_ts$NOy[data_ts$NOy < 0] <- NA
We have renamed this, now \(NO_2\) stripped, variable to \(NO_y\) for convenience. It is interesting to note the sudden increase that happens at the end of the summer which does not seem to disappear on the following year.
Its seasonality, present in the ACF, is both, semi-daily and daily, with the latter being more prominent. As an explanation, we may track car pollution, which operate on the same frequency, e.g. workers going and coming back from work by car or bus.
# NOy (NOx stripped of the NO2 contribution) overview
data_ts %>% pretty_ts("NOy", title = "Nitrogen Oxides [ppb]")
The \(NO_2\) exhibits a very different pattern, justifying the prior decoupling we made. For once, it does not appear to have sudden changes when in different year epochs like the rest of gases, with a concentration that, at first glance, resembles a kind of stationary signal.
Moreover, the seasonality appears to be strictly daily, with a very fast ACF decrease, which indicates absence of trend.
# NO2 overview
data_ts %>% pretty_ts("NO2", title = "Nitrogen Dioxide [μg/m³]")
In this section we will attempt to forecast both the gases and the temperature, assuming a relation between them. The target will be a 6-month forecast. First, we will need to find suitable models for each of the gases and forecast them into the future. Then, we will use the predictions to construct a model in which the temperature may be estimated with a time series model and the forecasts of all the gases. There, we expect to learn which of the gases have a bigger impact on the weather.
Given that we are treating four time series, one for each polluting agent, we ought to have an unified and organised schema of the steps for the correct modelling and forecast. Let us detail the recipe we will attempt to follow for the following part:
For each of the agents we will see if a transformation would be fit to help and normalise the distribution of the values. We can note that in general we have very right-skewed distributions which may benefit from a low-\(\lambda\) Box-Cox transformation.
We will implement five models for each agent: an automatic ARIMA, a manual ARIMA, an automatic ETS, a Prophet model and an automatic TSLM with a various Fourier components which account for all possible periodicities. For the manual ARIMA we will proceed by differentiating to white-noise and interpret the ACF and PACF.
Each of these models will be tested using 3-fold cross-validation. Using the RMSE we will extract which model performed better. Then, we will select the model which performed better on the majority of the folds.
The selected model will later be used to forecast the next 6-month concentrations of the different polluting agents.
Before starting the process, let us divide the data set in a
training, on which we will make decisions and apply cross-validation,
and a test partition, to get an estimation of the performance we may
expect. This split will be made giving the training set 5 times more
data than to the test one. For the cross-validation we will consider 7
months as training part and 1 more for testing. Note that
cross-validation splits are taken from the train partition. We also take
here the chance to impute all NA data, since missing values make some of
the models invalid. Each NA is filled with the value just before
it.
# Input missing values
# (forward fill: each NA takes the last observed value before it)
data_ts <- data_ts %>% fill(everything())
# Duplicate Temp variable for later convenience
# (TempM is used as the response of the multivariate temperature models)
data_ts <- data_ts %>% mutate(TempM = Temp)
# Train-test split and CV splits
# 10 months train / 2 months test gives the 5:1 ratio described above
traintest <- data_ts %>%
  as_tibble() %>%
  time_series_split(date_var = Date, initial = "10 months", assess = "2 months")
# 3 rolling (non-cumulative) CV folds inside the training partition:
# 7 months training + 1 month assessment each
cvsplits <- training(traintest) %>%
  time_series_cv(date_var = Date,
                 initial = "7 months",
                 assess = "1 month",
                 skip = "1 month",
                 slice_limit = 3,
                 cumulative = FALSE)
# Convert to tsibble for convenience
train_ts <- training(traintest) %>% as_tsibble(index = Date)
test_ts <- testing(traintest) %>% as_tsibble(index = Date)
First of all, we are going to consider a Box-Cox transformation to get a more normal distribution of the values, reducing skewness and heteroskedasticity. By looking at the following plots it does indeed appear that this transformation helps in getting a time series with which we may work more easily.
# Fix: "atmosferic" -> "atmospheric" in the displayed plot title
train_ts %>% compare_box_cox("CO", "CO atmospheric concentration [mg/m³]")
Now we need to examine the time series to create an appropriate ARIMA model. For this purpose, we are going to apply a 24-hour differentiation, which we decide upon by looking at the next graph. Then, we check and see using unitroot tests that a normal difference is not needed for the time series to reach a white-noise behaviour.
# Box-Cox transform CO and inspect the ACF/PACF up to 3 days of lags
train_ts %>%
  mutate(CO = box_cox_vec(CO, silent = TRUE)) %>%
  pretty_ts("CO", "Box-Cox transformed CO", type = "partial", lag_max = 72)
# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>%
  mutate(BC_CO = box_cox_vec(CO, silent = TRUE),
         dBC_CO = difference(BC_CO, 24)) %>%
  features(dBC_CO, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
# ndiffs = 0: the 24 h seasonal difference alone suffices
# Difference and see if stationary
train_ts %>%
  mutate(CO = difference(box_cox_vec(CO, silent = TRUE), 24)) %>%
  pretty_ts("CO", "Diff_24 Box-Cox transformed CO", type = "partial", lag_max = 72)
Looking at the ACF and PACF we decide that an \(ARIMA(3,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.
# Look at PACF => p = 3
# Look at PACF_24 => P = 3
# Compare the five candidate CO models on each of the 3 CV folds by RMSE.
for (sp in enumerate(cvsplits$splits)) {
  # sp[[2]] is the rsample split; sp$ix is the fold counter from enumerate()
  trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
  tstdat <- testing(sp[[2]]) %>% as_tsibble(index = Date)
  # Box-Cox lambda estimated on the fold's training part only
  lam <- auto_lambda(trndat$CO)
  models_CO <- trndat %>%
    model(
      manual_arima = ARIMA(box_cox(CO, lam) ~ pdq(3, 0, 0) + PDQ(3, 1, 0, period = 24)),
      auto_arima = ARIMA(box_cox(CO, lam) ~ PDQ(period = 24)),
      prophet = prophet(box_cox(CO, lam) ~ season(period = 24)),
      auto_ets = ETS(box_cox(CO, lam) ~ season(period = 24)),
      auto_tslm = TSLM(box_cox(CO, lam) ~ trend() +
                         fourier(period = "1 day", K = 3) +
                         fourier(period = "1 year", K = 3))) %>%
    forecast(tstdat) %>%
    accuracy(data_ts)
  print(paste("The best model for split", sp$ix, "is",
              models_CO$.model[which.min(models_CO$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is prophet"
## [1] "The best model for split 3 is prophet"
We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. We also visualize its performance on the test partition which captures the tendency and with the confidence intervals offers a good approximation on the range where real values are measured. It fails nevertheless to reproduce the correct oscillatory movement.
# Refit the winning model (Prophet) on the whole training partition
lam_CO <- auto_lambda(train_ts$CO)
model_CO <- train_ts %>%
  model(prophet(box_cox(CO, lam_CO) ~ season(period = 24)))
# Residual diagnostics
model_CO %>%
  residuals() %>%
  pretty_resid("CO", title = "Residuals for Carbon Monoxide [mg/m³]")
# Forecast over the test partition (grey overlay = actual measurements)
train_ts %>%
  filter(year(Date) == 2005) %>%
  autoplot(CO) +
  autolayer(forecast(model_CO, test_ts), CO, color = colors["CO"]) +
  autolayer(test_ts, CO, color = "#838B8B", alpha = 0.75) +
  labs(x = "", y = "Carbon Monoxide [mg/m³]")
As with CO, we are going to consider a Box-Cox transformation for the same reasons. By looking at the following plots it does indeed appear that this transformation helps in getting a time series with which we may work more easily.
# Fix: "atmosferic" -> "atmospheric" in the displayed plot title
train_ts %>% compare_box_cox("C6H6", "Benzene atmospheric concentration [μg/m³]")
Looking at previous ACF, we are going to apply a 24-hour differentiation. Then, we check and see using unitroot tests that a normal difference is not needed for the time series to reach a white-noise behaviour.
# Box-Cox transform Benzene and inspect the ACF/PACF up to 3 days of lags
train_ts %>%
  mutate(C6H6 = box_cox_vec(C6H6, silent = TRUE)) %>%
  pretty_ts("C6H6", "Box-Cox transformed Benzene", type = "partial", lag_max = 72)
# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>%
  mutate(BC_C6H6 = box_cox_vec(C6H6, silent = TRUE),
         dBC_C6H6 = difference(BC_C6H6, 24)) %>%
  features(dBC_C6H6, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
# ndiffs = 0: the 24 h seasonal difference alone suffices
# Difference and see if stationary
train_ts %>%
  mutate(C6H6 = difference(box_cox_vec(C6H6, silent = TRUE), 24)) %>%
  pretty_ts("C6H6", "Diff_24 Box-Cox transformed Benzene", type = "partial", lag_max = 72)
Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS Prophet and TSLM.
# Look at PACF => p = 2
# Look at PACF_24 => P = 3
# Compare the five candidate Benzene models on each CV fold by RMSE.
for (sp in enumerate(cvsplits$splits)) {
  trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
  tstdat <- testing(sp[[2]]) %>% as_tsibble(index = Date)
  # Box-Cox lambda estimated on the fold's training part only
  lam <- auto_lambda(trndat$C6H6)
  models_C6H6 <- trndat %>%
    model(
      manual_arima = ARIMA(box_cox(C6H6, lam) ~ 1 + pdq(2, 0, 0) +
                             PDQ(3, 1, 0, period = 24)),
      auto_arima = ARIMA(box_cox(C6H6, lam) ~ PDQ(period = 24)),
      prophet = prophet(box_cox(C6H6, lam) ~ season(period = 24)),
      auto_ets = ETS(box_cox(C6H6, lam) ~ season(period = 24)),
      auto_tslm = TSLM(box_cox(C6H6, lam) ~ trend() +
                         fourier(period = "1 day", K = 3) +
                         fourier(period = "1 year", K = 3))) %>%
    forecast(tstdat) %>%
    accuracy(data_ts)
  print(paste("The best model for split", sp$ix, "is",
              models_C6H6$.model[which.min(models_C6H6$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is prophet"
## [1] "The best model for split 3 is prophet"
We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. As with CO, the forecast captures the tendency but not the correct oscillatory behaviour.
# Refit the winning model (Prophet) on the whole training partition
lam_C6H6 <- auto_lambda(train_ts$C6H6)
model_C6H6 <- train_ts %>%
  model(prophet(box_cox(C6H6, lam_C6H6) ~ season(period = 24)))
# Fix: inspect the Benzene model's residuals — the original mistakenly
# reused model_CO and the "CO" colour key here.
model_C6H6 %>%
  residuals() %>%
  pretty_resid("C6H6", title = "Residuals for Benzene [μg/m³]")
# Forecast over the test partition (grey overlay = actual measurements)
train_ts %>%
  filter(year(Date) == 2005) %>%
  autoplot(C6H6) +
  autolayer(forecast(model_C6H6, test_ts), C6H6, color = colors["C6H6"]) +
  autolayer(test_ts, C6H6, color = "#838B8B", alpha = 0.75) +
  labs(x = "", y = "Benzene [μg/m³]")
This may be the variable with more heteroskedasticity among the batch. Hence we Box-Cox transform in an attempt to at least mitigate some of the unevenness. In contrast with the previous gases, here the utility of the transformation is critical in getting a somewhat homoskedastic time series.
# Fix: "atmosferic" -> "atmospheric" in the displayed plot title
train_ts %>% compare_box_cox("NOy", "Nitrogen Oxides atmospheric concentration [ppb]")
If we plot the ACF of the transformed data, we see justification for the seasonal differentiation.
# Box-Cox transform NOy and inspect the ACF/PACF up to 3 days of lags
train_ts %>%
  mutate(NOy = box_cox_vec(NOy, silent = TRUE)) %>%
  pretty_ts("NOy", "Box-Cox transformed Nitrogen Oxides", type = "partial", lag_max = 72)
# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>%
  mutate(BC_NOy = box_cox_vec(NOy, silent = TRUE),
         dBC_NOy = difference(BC_NOy, 24)) %>%
  features(dBC_NOy, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
# ndiffs = 0: the 24 h seasonal difference alone suffices
# Difference and see if stationary
train_ts %>%
  mutate(NOy = difference(box_cox_vec(NOy, silent = TRUE), 24)) %>%
  pretty_ts("NOy", "Diff_24 Box-Cox transformed Nitrogen Oxides", type = "partial", lag_max = 72)
Looking at the ACF and PACF we decide that an \(ARIMA(3,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.
# Look at PACF => p = 3
# Look at PACF_24 => P = 3  (the original comment had a "PACF_34" typo)
# Compare the five candidate NOy models on each CV fold by RMSE.
for (sp in enumerate(cvsplits$splits)) {
  trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
  tstdat <- testing(sp[[2]]) %>% as_tsibble(index = Date)
  # Box-Cox lambda estimated on the fold's training part only
  lam <- auto_lambda(trndat$NOy)
  models_NOy <- trndat %>%
    model(
      manual_arima = ARIMA(box_cox(NOy, lam) ~ 1 + pdq(3, 0, 0) + PDQ(3, 1, 0, period = 24)),
      auto_arima = ARIMA(box_cox(NOy, lam) ~ PDQ(period = 24)),
      prophet = prophet(box_cox(NOy, lam) ~ season(period = 24)),
      auto_ets = ETS(box_cox(NOy, lam) ~ season(period = 24)),
      auto_tslm = TSLM(box_cox(NOy, lam) ~ trend() +
                         fourier(period = "1 day", K = 3) +
                         fourier(period = "1 year", K = 3))) %>%
    forecast(tstdat) %>%
    accuracy(data_ts)
  print(paste("The best model for split", sp$ix, "is",
              models_NOy$.model[which.min(models_NOy$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is prophet"
## [1] "The best model for split 3 is prophet"
We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. On the performance in the test partition, the same conclusions extracted before apply here.
# Refit the winning model (Prophet) on the whole training partition
lam_NOy <- auto_lambda(train_ts$NOy)
model_NOy <- train_ts %>%
  model(prophet(box_cox(NOy, lam_NOy) ~ season(period = 24)))
# Residual diagnostics
model_NOy %>%
  residuals() %>%
  pretty_resid("NOy", title = "Residuals for Nitrogen Oxides [ppb]")
# Forecast over the test partition (grey overlay = actual measurements)
train_ts %>%
  filter(year(Date) == 2005) %>%
  autoplot(NOy) +
  autolayer(forecast(model_NOy, test_ts), NOy, color = colors["NOy"]) +
  autolayer(test_ts, NOy, color = "#838B8B", alpha = 0.75) +
  labs(x = "", y = "Nitrogen Oxides [ppb]")
Different to the rest of the polluting agents, this one does not seem to require of a transformation. However, this transformation will ensure the positivity of all predicted values in the future, which is desirable. Hence we apply it here also.
# Fix: "atmosferic" -> "atmospheric" in the displayed plot title
train_ts %>% compare_box_cox("NO2", "Nitrogen Dioxide atmospheric concentration [μg/m³]")
Looking at the ACF in the previous section, we note that there is a strong daily seasonality. Hence we differentiate with 24 lags.
If we plot the ACF of the transformed data, we see justification for the seasonal differentiation.
# Box-Cox transform NO2 and inspect the ACF/PACF up to 3 days of lags.
# Fix: the plot title said "Nitrogen Oxides" although the series is NO2.
train_ts %>%
  mutate(NO2 = box_cox_vec(NO2, silent = TRUE)) %>%
  pretty_ts("NO2", "Box-Cox transformed Nitrogen Dioxide", type = "partial", lag_max = 72)
# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>%
  mutate(BC_NO2 = box_cox_vec(NO2, silent = TRUE),
         dBC_NO2 = difference(BC_NO2, 24)) %>%
  features(dBC_NO2, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
# ndiffs = 0: the 24 h seasonal difference alone suffices
# Difference and see if stationary
train_ts %>%
  mutate(NO2 = difference(box_cox_vec(NO2, silent = TRUE), 24)) %>%
  pretty_ts("NO2", "Diff_24 Box-Cox transformed Nitrogen Dioxide",
            type = "partial", lag_max = 72)
Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(0,1,3)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.
# Look at PACF => p = 2  (the model and the text use p = 2; the original
# comment said p = 1, inconsistent with pdq(2, 0, 0) below)
# Look at ACF_24 => Q = 3
# Compare the five candidate NO2 models on each CV fold by RMSE.
for (sp in enumerate(cvsplits$splits)) {
  trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
  tstdat <- testing(sp[[2]]) %>% as_tsibble(index = Date)
  # Box-Cox lambda estimated on the fold's training part only
  lam <- auto_lambda(trndat$NO2)
  models_NO2 <- trndat %>%
    model(
      manual_arima = ARIMA(box_cox(NO2, lam) ~ 1 + pdq(2, 0, 0) + PDQ(0, 1, 3, period = 24)),
      auto_arima = ARIMA(box_cox(NO2, lam) ~ PDQ(period = 24)),
      prophet = prophet(box_cox(NO2, lam) ~ season(period = 24)),
      auto_ets = ETS(box_cox(NO2, lam) ~ season(period = 24)),
      auto_tslm = TSLM(box_cox(NO2, lam) ~ trend() +
                         fourier(period = "1 day", K = 3) +
                         fourier(period = "1 year", K = 3))) %>%
    forecast(tstdat) %>%
    accuracy(data_ts)
  print(paste("The best model for split", sp$ix, "is",
              models_NO2$.model[which.min(models_NO2$RMSE)]))
}
## [1] "The best model for split 1 is manual_arima"
## [1] "The best model for split 2 is auto_arima"
## [1] "The best model for split 3 is auto_arima"
We see that the residuals follow a normal distribution and a white-noise like waveform. In contrast to previous series, here the ACF does resemble much closer the one expected from a white-noise signal. On the other hand, when comparing with the test partition, we see very wide confidence intervals which result in a much less precise model.
# Refit the winning model (automatic ARIMA) on the whole training partition
lam_NO2 <- auto_lambda(train_ts$NO2)
model_NO2 <- train_ts %>%
  # Fix: use lam_NO2 — the original referenced `lam`, a stale leftover from
  # the last CV-loop iteration, not the lambda fitted on the full train set.
  model(ARIMA(box_cox(NO2, lam_NO2) ~ PDQ(period = 24)))
# Residual diagnostics
model_NO2 %>%
  residuals() %>%
  pretty_resid("NO2", title = "Residuals for Nitrogen Dioxide [μg/m³]")
# Forecast over the test partition (grey overlay = actual measurements);
# the y-axis is clipped at 500 to keep the wide intervals readable
train_ts %>%
  filter(year(Date) == 2005) %>%
  autoplot(NO2) +
  autolayer(forecast(model_NO2, test_ts), NO2, color = colors["NO2"]) +
  autolayer(test_ts, NO2, color = "#838B8B", alpha = 0.75) +
  labs(x = "", y = "Nitrogen Dioxide [μg/m³]") +
  coord_cartesian(ylim = c(NA, 500))
We observe that for all the polluting agents the best model overall was Prophet, except for NO2, where automatic ARIMA performed best. Hence, we will use these selected models in the following section to forecast the concentration of the various gases in order to test their effect on the temperature. For now, let us move on to temperature and how a univariate and multivariate approach on its forecast may behave.
Now we are tasked to find a good model to describe the temperature as a function of the different polluting agents. We will attempt this by first finding a suitable model for the temperature just as we did for the gases. Then, we will play with the number of predictors to include in the model. At last, we will consider the 6-month forecast as we did for pollutants.
Let us start by finding the model. We do not believe that a Box-Cox transformation is needed given the apparent homoskedasticity of the data. Now we examine the time series to create an appropriate ARIMA model. For this purpose, we are going to apply a 24-hour differentiation. Then, we check and see using unitroot tests that a normal difference is not needed.
# Let's differentiate seasonably. Then check if more differences are needed
# (no Box-Cox here: the temperature looks homoskedastic already)
train_ts %>%
  mutate(dTemp = difference(Temp, 24)) %>%
  features(dTemp, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
# ndiffs = 0: the 24 h seasonal difference alone suffices
# Difference and see if stationary
train_ts %>%
  mutate(Temp = difference(Temp, 24)) %>%
  pretty_ts("Temp", "Diff_24 Temperature", type = "partial", lag_max = 72)
Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(2,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.
# Look at PACF => p = 2
# Look at PACF_24 => P = 2
# Compare the five candidate temperature models on each CV fold by RMSE.
for (sp in enumerate(cvsplits$splits)) {
  trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
  tstdat <- testing(sp[[2]]) %>% as_tsibble(index = Date)
  models_Temp <- trndat %>%
    model(
      manual_arima = ARIMA(Temp ~ 1 + pdq(2, 0, 0) + PDQ(2, 1, 0, period = 24)),
      auto_arima = ARIMA(Temp ~ PDQ(period = 24)),
      prophet = prophet(Temp ~ season(period = 24)),
      auto_ets = ETS(Temp ~ season(period = 24)),
      auto_tslm = TSLM(Temp ~ trend() + fourier(period = "1 day", K = 3) +
                         fourier(period = "1 year", K = 3))) %>%
    forecast(tstdat) %>%
    accuracy(data_ts)
  print(paste("The best model for split", sp$ix, "is",
              models_Temp$.model[which.min(models_Temp$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is auto_arima"
## [1] "The best model for split 3 is prophet"
Let us proceed as usual and look at residuals and performance in the test partition. Residuals exhibit the same behaviour as we noted with the polluting agents, with a normal distribution zero-centred but very high correlations as noted in the ACF. For the performance in the test partition, we note a very poor prediction which, although not bad for the first month, quickly starts falling below the actual recorded data. This is a sign that the model is not able to see the whole yearly seasonality, which is not too rare considering that we only have one full year of data, hence less in the train partition.
# Prophet won 2 of the 3 folds; refit it on the full training partition
model_Temp <- train_ts %>%
  model(prophet(Temp ~ season(period = 24)))
# Residual diagnostics
model_Temp %>%
  residuals() %>%
  pretty_resid("Temp", title = "Residuals for Temperature")
# Forecast over the test partition (grey overlay = actual measurements)
train_ts %>%
  filter(year(Date) == 2005) %>%
  autoplot(Temp) +
  autolayer(forecast(model_Temp, test_ts), Temp, colour = colors["Temp"]) +
  autolayer(test_ts, Temp, color = "#838B8B", alpha = 0.75) +
  labs(x = "", y = "Temperature [°C]")
Without further ado, let us move directly to the last point of this whole section, the multivariate forecast on the temperature.
We are now going to consider adding the gases to the model of the temperature. For this, we will compare two models which work very well and with good stability when adding predictors 3: TSLM and VAR. Note that the VAR model is able to predict not only for the temperature but also for all the agents at once. Nonetheless, we are interested in the temperature as we expect the presence of one concrete agent not to have a big impact on another 4. Hence we will only compare the performance for the temperature.
Let us start by determining the TSLM to compare by engaging in a manual iterative process, removing predictors which do not appear to be related with the temperature. We consider in a first approximation all the polluting agents at the same moment and with a one-day delayed effect.
# All
# Full multivariate model: trend + daily and yearly Fourier terms + all four
# gases, both contemporaneous and lagged one day (24 h).  TempM is the copy
# of Temp created earlier so the response keeps a distinct name.
predict_model <- train_ts %>%
  model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
               fourier(period = "1 year", K = 3) +
               CO + C6H6 + NOy + NO2 +
               lag(CO, 24) + lag(C6H6, 24) + lag(NOy, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.46016 -2.11269 -0.02342 2.23941 12.77775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.0125332 0.7104920 38.019 < 2e-16
## trend() -0.0025845 0.0001873 -13.795 < 2e-16
## fourier(period = "1 day", K = 3)C1_24 -2.5552593 0.0596198 -42.859 < 2e-16
## fourier(period = "1 day", K = 3)S1_24 -2.9253156 0.0650771 -44.952 < 2e-16
## fourier(period = "1 day", K = 3)C2_24 0.9931345 0.0565636 17.558 < 2e-16
## fourier(period = "1 day", K = 3)S2_24 1.1763642 0.0600489 19.590 < 2e-16
## fourier(period = "1 day", K = 3)C3_24 -0.2672563 0.0557770 -4.792 1.69e-06
## fourier(period = "1 day", K = 3)S3_24 0.1400547 0.0563092 2.487 0.01290
## fourier(period = "1 year", K = 3)C1_8766 -2.8549810 0.4424449 -6.453 1.17e-10
## fourier(period = "1 year", K = 3)S1_8766 -8.1663552 0.2476783 -32.972 < 2e-16
## fourier(period = "1 year", K = 3)C2_8766 3.1586629 0.1787405 17.672 < 2e-16
## fourier(period = "1 year", K = 3)S2_8766 2.0487480 0.1358172 15.085 < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0520145 0.0725616 -0.717 0.47350
## fourier(period = "1 year", K = 3)S3_8766 0.9252624 0.1092749 8.467 < 2e-16
## CO 0.1873384 0.0617437 3.034 0.00242
## C6H6 0.1506110 0.0100857 14.933 < 2e-16
## NOy -0.0070573 0.0005080 -13.893 < 2e-16
## NO2 0.0001194 0.0018826 0.063 0.94943
## lag(CO, 24) 0.1041761 0.0619981 1.680 0.09294
## lag(C6H6, 24) 0.0503747 0.0101037 4.986 6.31e-07
## lag(NOy, 24) -0.0002720 0.0005087 -0.535 0.59285
## lag(NO2, 24) -0.0048802 0.0018770 -2.600 0.00934
##
## (Intercept) ***
## trend() ***
## fourier(period = "1 day", K = 3)C1_24 ***
## fourier(period = "1 day", K = 3)S1_24 ***
## fourier(period = "1 day", K = 3)C2_24 ***
## fourier(period = "1 day", K = 3)S2_24 ***
## fourier(period = "1 day", K = 3)C3_24 ***
## fourier(period = "1 day", K = 3)S3_24 *
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO **
## C6H6 ***
## NOy ***
## NO2
## lag(CO, 24) .
## lag(C6H6, 24) ***
## lag(NOy, 24)
## lag(NO2, 24) **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.337 on 7298 degrees of freedom
## Multiple R-squared: 0.854, Adjusted R-squared: 0.8536
## F-statistic: 2033 on 21 and 7298 DF, p-value: < 2.22e-16
# Step 2 -- drop contemporaneous NO2, the least significant predictor in the
# previous fit (p = 0.949); its lagged term is kept for now.
predict_model <- train_ts %>%
model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
fourier(period = "1 year", K = 3) +
CO + C6H6 + NOy +
lag(CO, 24) + lag(C6H6, 24) + lag(NOy, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4566 -2.1120 -0.0238 2.2399 12.7771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.0153962 0.7090081 38.103 < 2e-16
## trend() -0.0025841 0.0001872 -13.802 < 2e-16
## fourier(period = "1 day", K = 3)C1_24 -2.5556732 0.0592576 -43.128 < 2e-16
## fourier(period = "1 day", K = 3)S1_24 -2.9260439 0.0640515 -45.683 < 2e-16
## fourier(period = "1 day", K = 3)C2_24 0.9933250 0.0564799 17.587 < 2e-16
## fourier(period = "1 day", K = 3)S2_24 1.1760777 0.0598747 19.642 < 2e-16
## fourier(period = "1 day", K = 3)C3_24 -0.2672110 0.0557686 -4.791 1.69e-06
## fourier(period = "1 day", K = 3)S3_24 0.1401956 0.0562616 2.492 0.01273
## fourier(period = "1 year", K = 3)C1_8766 -2.8555317 0.4423295 -6.456 1.15e-10
## fourier(period = "1 year", K = 3)S1_8766 -8.1651412 0.2469207 -33.068 < 2e-16
## fourier(period = "1 year", K = 3)C2_8766 3.1589507 0.1786707 17.680 < 2e-16
## fourier(period = "1 year", K = 3)S2_8766 2.0491144 0.1356850 15.102 < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0522345 0.0724737 -0.721 0.47109
## fourier(period = "1 year", K = 3)S3_8766 0.9252629 0.1092674 8.468 < 2e-16
## CO 0.1887034 0.0578669 3.261 0.00112
## C6H6 0.1506516 0.0100647 14.968 < 2e-16
## NOy -0.0070488 0.0004898 -14.392 < 2e-16
## lag(CO, 24) 0.1037752 0.0616709 1.683 0.09247
## lag(C6H6, 24) 0.0503104 0.0100521 5.005 5.72e-07
## lag(NOy, 24) -0.0002762 0.0005044 -0.548 0.58401
## lag(NO2, 24) -0.0048267 0.0016759 -2.880 0.00399
##
## (Intercept) ***
## trend() ***
## fourier(period = "1 day", K = 3)C1_24 ***
## fourier(period = "1 day", K = 3)S1_24 ***
## fourier(period = "1 day", K = 3)C2_24 ***
## fourier(period = "1 day", K = 3)S2_24 ***
## fourier(period = "1 day", K = 3)C3_24 ***
## fourier(period = "1 day", K = 3)S3_24 *
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO **
## C6H6 ***
## NOy ***
## lag(CO, 24) .
## lag(C6H6, 24) ***
## lag(NOy, 24)
## lag(NO2, 24) **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.337 on 7299 degrees of freedom
## Multiple R-squared: 0.854, Adjusted R-squared: 0.8536
## F-statistic: 2135 on 20 and 7299 DF, p-value: < 2.22e-16
# Step 3 -- drop lag(NOy, 24), the least significant remaining predictor in
# the previous fit (p = 0.584).
predict_model <- train_ts %>%
model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
fourier(period = "1 year", K = 3) +
CO + C6H6 + NOy +
lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.44237 -2.11714 -0.02053 2.24370 12.76143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.0385857 0.7077082 38.206 < 2e-16
## trend() -0.0025857 0.0001872 -13.813 < 2e-16
## fourier(period = "1 day", K = 3)C1_24 -2.5562688 0.0592447 -43.148 < 2e-16
## fourier(period = "1 day", K = 3)S1_24 -2.9315717 0.0632479 -46.350 < 2e-16
## fourier(period = "1 day", K = 3)C2_24 0.9923391 0.0564485 17.580 < 2e-16
## fourier(period = "1 day", K = 3)S2_24 1.1745608 0.0598077 19.639 < 2e-16
## fourier(period = "1 day", K = 3)C3_24 -0.2666500 0.0557565 -4.782 1.77e-06
## fourier(period = "1 day", K = 3)S3_24 0.1408785 0.0562450 2.505 0.012276
## fourier(period = "1 year", K = 3)C1_8766 -2.8664126 0.4418617 -6.487 9.32e-11
## fourier(period = "1 year", K = 3)S1_8766 -8.1570794 0.2464696 -33.096 < 2e-16
## fourier(period = "1 year", K = 3)C2_8766 3.1589713 0.1786621 17.681 < 2e-16
## fourier(period = "1 year", K = 3)S2_8766 2.0536349 0.1354271 15.164 < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0537652 0.0724163 -0.742 0.457841
## fourier(period = "1 year", K = 3)S3_8766 0.9252416 0.1092622 8.468 < 2e-16
## CO 0.1970763 0.0558073 3.531 0.000416
## C6H6 0.1512756 0.0099995 15.128 < 2e-16
## NOy -0.0071642 0.0004420 -16.207 < 2e-16
## lag(CO, 24) 0.0896344 0.0560005 1.601 0.109509
## lag(C6H6, 24) 0.0486305 0.0095720 5.080 3.86e-07
## lag(NO2, 24) -0.0050431 0.0016286 -3.097 0.001965
##
## (Intercept) ***
## trend() ***
## fourier(period = "1 day", K = 3)C1_24 ***
## fourier(period = "1 day", K = 3)S1_24 ***
## fourier(period = "1 day", K = 3)C2_24 ***
## fourier(period = "1 day", K = 3)S2_24 ***
## fourier(period = "1 day", K = 3)C3_24 ***
## fourier(period = "1 day", K = 3)S3_24 *
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO ***
## C6H6 ***
## NOy ***
## lag(CO, 24)
## lag(C6H6, 24) ***
## lag(NO2, 24) **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.337 on 7300 degrees of freedom
## Multiple R-squared: 0.854, Adjusted R-squared: 0.8536
## F-statistic: 2248 on 19 and 7300 DF, p-value: < 2.22e-16
# Step 4 -- reduce the yearly Fourier order to K = 2, since the K = 3 cosine
# term was not significant in the previous fits. This is the final TSLM
# specification carried forward to the model comparison below.
predict_model <- train_ts %>%
model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
fourier(period = "1 year", K = 2) +
CO + C6H6 + NOy +
lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.823705 -2.130351 0.007951 2.233559 13.167983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.199e+01 3.737e-01 58.848 < 2e-16
## trend() -1.230e-03 9.666e-05 -12.725 < 2e-16
## fourier(period = "1 day", K = 3)C1_24 -2.544e+00 5.949e-02 -42.753 < 2e-16
## fourier(period = "1 day", K = 3)S1_24 -2.910e+00 6.345e-02 -45.869 < 2e-16
## fourier(period = "1 day", K = 3)C2_24 9.981e-01 5.671e-02 17.598 < 2e-16
## fourier(period = "1 day", K = 3)S2_24 1.193e+00 6.004e-02 19.873 < 2e-16
## fourier(period = "1 day", K = 3)C3_24 -2.692e-01 5.602e-02 -4.805 1.58e-06
## fourier(period = "1 day", K = 3)S3_24 1.311e-01 5.650e-02 2.320 0.020346
## fourier(period = "1 year", K = 2)C1_8766 -6.031e+00 2.310e-01 -26.110 < 2e-16
## fourier(period = "1 year", K = 2)S1_8766 -6.442e+00 1.380e-01 -46.694 < 2e-16
## fourier(period = "1 year", K = 2)C2_8766 1.903e+00 9.930e-02 19.163 < 2e-16
## fourier(period = "1 year", K = 2)S2_8766 1.248e+00 8.569e-02 14.560 < 2e-16
## CO 2.109e-01 5.602e-02 3.764 0.000169
## C6H6 1.516e-01 1.005e-02 15.089 < 2e-16
## NOy -7.147e-03 4.438e-04 -16.104 < 2e-16
## lag(CO, 24) 1.172e-01 5.614e-02 2.087 0.036909
## lag(C6H6, 24) 4.791e-02 9.617e-03 4.982 6.44e-07
## lag(NO2, 24) -5.156e-03 1.631e-03 -3.161 0.001578
##
## (Intercept) ***
## trend() ***
## fourier(period = "1 day", K = 3)C1_24 ***
## fourier(period = "1 day", K = 3)S1_24 ***
## fourier(period = "1 day", K = 3)C2_24 ***
## fourier(period = "1 day", K = 3)S2_24 ***
## fourier(period = "1 day", K = 3)C3_24 ***
## fourier(period = "1 day", K = 3)S3_24 *
## fourier(period = "1 year", K = 2)C1_8766 ***
## fourier(period = "1 year", K = 2)S1_8766 ***
## fourier(period = "1 year", K = 2)C2_8766 ***
## fourier(period = "1 year", K = 2)S2_8766 ***
## CO ***
## C6H6 ***
## NOy ***
## lag(CO, 24) *
## lag(C6H6, 24) ***
## lag(NO2, 24) **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.352 on 7302 degrees of freedom
## Multiple R-squared: 0.8526, Adjusted R-squared: 0.8522
## F-statistic: 2484 on 17 and 7302 DF, p-value: < 2.22e-16
We end up with a model that includes the concentrations of Carbon Monoxide, Benzene and Nitrogen Oxides 5, together with the one-day lagged concentrations of Carbon Monoxide, Benzene and Nitrogen Dioxide.
Now we use cross-validation as before to compare this model with the one that VAR may predict for us.
# Labels for the two candidate models, in the same order as the RMSE vector
# built inside the loop (index 1 = VAR, index 2 = TSLM).
models_TempM <- c("VAR", "TSLM")
# Cross-validate over the same splits as before: fit both models on each
# training fold, forecast the corresponding test fold and report which one
# attains the lower RMSE for the temperature.
for (sp in enumerate(cvsplits$splits)) {
# sp is list(ix = <split number>, <rsample split object>); see enumerate()
# defined at the top of the document.
trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
tstdat <- testing(sp[[2]]) %>% as_tsibble(index = Date)
# VAR fits all five series jointly; only its Temp accuracy is used below.
# NOTE(review): the VAR models Temp while the TSLM models TempM -- confirm
# comparing the two responses' RMSEs is intended.
var_model <- trndat %>%
model(VAR(vars(Temp, CO, C6H6, NOy, NO2) ~ AR(24) + AR(8766))) %>%
forecast(tstdat) %>%
accuracy(data_ts)
# The TSLM selected by the manual backwards elimination above.
tslm_model <- trndat %>%
model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
fourier(period = "1 year", K = 2) +
CO + C6H6 + NOy +
lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24))) %>%
forecast(tstdat) %>%
accuracy(data_ts)
# Pick the model with the smaller RMSE; order matches models_TempM.
best_TempM <- which.min(c(filter(var_model, .response == "Temp")$RMSE,
tslm_model$RMSE))
print(paste("The best model for split", sp$ix, "is", models_TempM[best_TempM]))
}
## [1] "The best model for split 1 is TSLM"
## [1] "The best model for split 2 is TSLM"
## [1] "The best model for split 3 is VAR"
Now that we have checked that the best model for the temperature is the considered TSLM, we continue by looking at the residuals of the model, which again follow the exact same behaviour we observed for all previous series. The performance on the test partition is quite similar to the one we observed when no predictors were considered. However, this time there is a slight upward bend which indicates that the model has captured the trend better. This gives us hope for the future forecasts.
# Fit the selected TSLM for the temperature (with pollutant predictors) on
# the training partition.
model_TempM <- model(
train_ts,
TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
fourier(period = "1 year", K = 2) +
CO + C6H6 + NOy +
lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24))
)
# Residual diagnostics for the fitted model.
pretty_resid(residuals(model_TempM),
"TempM", title = "Residuals for Temperature (multi)")
# Overlay the test-partition forecast on the observed 2005 temperatures.
fc_test_TempM <- forecast(model_TempM, test_ts)
train_2005 <- filter(train_ts, year(Date) == 2005)
autoplot(train_2005, Temp) +
autolayer(fc_test_TempM, TempM, colour = colors["TempM"]) +
autolayer(test_ts, TempM, color = "#838B8B", alpha = 0.75) +
labs(x = "", y = "Temperature (multi) [°C]")
Finally we end all the work done in this section by using the selected models in a six-month forecast of both the polluting agents and the two models for the temperature. We note that all of the predictions look reasonable in general terms, especially on the first few months of the forecast. Later, the confidence intervals either explode, giving us little to no information, or the forecasts reach unphysical results.
The latter observation is more prominent for the univariate temperature model. On the other hand, the multivariate model encapsulates the yearly seasonality and provides reasonable and stable prediction intervals. Hence we claim that, by accounting for the possible effects of the different polluting agents, we end up with a very reasonable model for future temperature forecasts.
# Box-Cox transformation parameters for each pollutant, estimated with
# auto_lambda() (defined earlier in the document; presumably based on
# guerrero() -- TODO confirm).
lam_CO <- auto_lambda(data_ts$CO)
lam_C6H6 <- auto_lambda(data_ts$C6H6)
lam_NOy <- auto_lambda(data_ts$NOy)
lam_NO2 <- auto_lambda(data_ts$NO2)
# Six-month forecasts from the univariate models selected earlier.
# Pollutants are modelled on their Box-Cox scale; fable back-transforms the
# forecast distributions automatically.
fit_CO <- model(data_ts, prophet(box_cox(CO, lam_CO) ~ season(period = 24)))
forecast_CO <- forecast(fit_CO, h = "6 months")
fit_C6H6 <- model(data_ts,
                  prophet(box_cox(C6H6, lam_C6H6) ~ season(period = 24)))
forecast_C6H6 <- forecast(fit_C6H6, h = "6 months")
fit_NOy <- model(data_ts,
                 prophet(box_cox(NOy, lam_NOy) ~ season(period = 24)))
forecast_NOy <- forecast(fit_NOy, h = "6 months")
# Nitrogen Dioxide uses a seasonal ARIMA instead of Prophet.
fit_NO2 <- model(data_ts, ARIMA(box_cox(NO2, lam_NO2) ~ PDQ(period = 24)))
forecast_NO2 <- forecast(fit_NO2, h = "6 months")
# Univariate temperature model, kept for comparison with the multivariate one.
fit_Temp <- model(data_ts, prophet(Temp ~ season(period = 24)))
forecast_Temp <- forecast(fit_Temp, h = "6 months")
# Forecast of the multivariate model.
# Build the future exogenous regressors from the univariate point forecasts
# of the pollutants over the same six-month horizon.
future_data <- new_data(data_ts, n = nrow(forecast_Temp)) %>%
mutate(CO = forecast_CO$.mean, C6H6 = forecast_C6H6$.mean,
NOy = forecast_NOy$.mean, NO2 = forecast_NO2$.mean)
# Refit the selected TSLM on the full data and forecast over future_data.
# The daily Fourier term uses period = "1 day" to match the specification
# used everywhere else (e.g. model_TempM); the previous period = 24 was only
# coincidentally equivalent because the data is hourly.
forecast_TempM <- data_ts %>%
model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
fourier(period = "1 year", K = 2) +
CO + C6H6 + NOy +
lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24))) %>%
forecast(future_data)
# Plot every forecast over its full series, two panels per figure
# (patchwork's `/` stacks plots vertically).
plt_co <- autoplot(data_ts, CO) +
  autolayer(forecast_CO, CO, color = colors["CO"]) +
  labs(x = "", y = "Carbon Monoxide [mg/m³]")
plt_c6h6 <- autoplot(data_ts, C6H6) +
  autolayer(forecast_C6H6, C6H6, color = colors["C6H6"]) +
  labs(x = "", y = "Benzene [μg/m³]")
# Clip the y-axis (without dropping data) so the forecast remains readable.
plt_noy <- autoplot(data_ts, NOy) +
  autolayer(forecast_NOy, NOy, color = colors["NOy"]) +
  coord_cartesian(ylim = c(NA, 1500)) +
  labs(x = "", y = "Nitrogen Oxides [ppb]")
plt_no2 <- autoplot(data_ts, NO2) +
  autolayer(forecast_NO2, NO2, color = colors["NO2"]) +
  coord_cartesian(ylim = c(0, 500)) +
  labs(x = "", y = "Nitrogen Dioxide [μg/m³]")
plt_temp <- autoplot(data_ts, Temp) +
  autolayer(forecast_Temp, Temp, color = colors["Temp"]) +
  labs(x = "", y = "Temperature [°C]")
plt_tempm <- autoplot(data_ts, TempM) +
  autolayer(forecast_TempM, TempM, color = colors["TempM"]) +
  labs(x = "", y = "Temperature (multi) [°C]")
plt_co / plt_c6h6
plt_noy / plt_no2
plt_temp / plt_tempm
Remember we are dealing with data coming from an Italian city.↩︎
The reason why we use “UTC+1” and not “CET” is that the data does not account for seasonal time shift for energy saving so neither should we.↩︎
Inner testing showed that Prophet, ETS and ARIMA tend to give null models frequently when adding predictors. Moreover, the two models considered here offer a very neat way of knowing whether the considered variables are indeed related with the temperature or, as in the case of VAR, provide a form of feature selection in itself.↩︎
This is not to say that the concentrations will not be correlated — they will be — but rather that a high concentration of one agent should not produce an effect on the rest, as the relation should stem from both having the same emission channels.↩︎
Remember that this variable excludes (to a first approximation) the contributions due to Nitrogen Dioxide.↩︎